Smart phones are a neccessity to every one in todays life, but what are the popular ones, what are the price? Which one you should choose? To gain inside to the current smart phone brand and models, we use Verizon wireless online store (https://www.verizonwireless.com/smartphones/) for our web scraping porject, to collect information in hope to get an answer.
Verizon is the biggest wireless service provider, their online store stocked 30+ smart phone models, 9 brands which have 30,000+ customer reviews. All of these will provide valuable information for our study.
from IPython.display import Image
Image(filename='verizonwireless scrapping.png')
from db import *
import pandas as pd
# read dataframe from SQL tables: review, product, summary
review = Review()
product = Product()
review_summary = Review_summary()
review_records = review.select()
product_records = product.select()
summary_records = review_summary.select()
# drop columns 'id' since we are using pandas index.
review_df = pd.DataFrame(list(review_records.dicts())).drop(columns='id')
product_df = pd.DataFrame(list(product_records.dicts())).drop(columns='id').drop_duplicates()
summary_df =pd.DataFrame(list(summary_records.dicts())).drop(columns='id')
# checking dataframe columns, sizes,
print('size of review dataframe: ',review_df.shape)
print(review_df.info())
print(review_df.head(2))
print('size product dataframe: ',product_df.shape)
print(product_df.info())
print(product_df.head(2))
print('phone models: ',product_df[['brand','price']].drop_duplicates().shape)
print('size review summary dataframe: ',summary_df.shape)
print(summary_df.info())
print(summary_df.head(2))
Trim phone model name to 20 characters and merge product_df to summary_df
pd.set_option('display.max_columns',20)
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = (10,6)
import seaborn as sns
# limit model name length to 20 characters
product_df['product_name']=product_df['product_name'].apply(lambda x: x[:20].strip())
# rename column 'storage' to 'memory'
product_df.columns = ['brand', 'color', 'price', 'product_name', 'skuId', 'spec_name', 'spec_value', 'memory', 'store']
# combine product dataframe with summary dataframe to create a new dataframe 'new_product'
new_product = product_df.merge(summary_df, how='right', on='skuId')
new_product.head(2)
# there are multiple entries for the same phone model but with different memory size,
# so a phone model price comes from the mean of difference memory price
# each spec_name and spec_value also takes a seperate entry for the same phone model
avg_price= pd.DataFrame(product_df.groupby(['brand','product_name'])['price'].mean() )
avg_price.columns = ['price']
avg_price = avg_price.reset_index().sort_values(by=['brand','price'], ascending=[True,False])
plt.figure(figsize=(6,8))
sns.barplot(data=avg_price, y='product_name',x='price',hue='brand', dodge=False, palette="colorblind")
plt.ylabel('phone model')
plt.xlabel('Price')
plt.title("Price vs Phone Models")
plt.legend(bbox_to_anchor=(1.05,1), loc=2, borderaxespad=0.)
# counting models from each brand
models = product_df[['brand','skuId']].drop_duplicates().groupby('brand').count().sort_values(by=['skuId'], ascending=False)
models.plot.bar()
plt.ylabel('Number of Models')
plt.title("Number of Models vs Brand")
# boxplot on price range for each brand
import seaborn as sns
price_brand= new_product.pivot_table(values='price', index=['brand','product_name'])
price_brand.index = price_brand.index.droplevel(level=1)
price_brand = price_brand.reset_index()
sns.boxplot(y='price',x='brand',data=price_brand, width=0.5, palette="colorblind")
plt.title("Phone Price vs Brand")
We consider popularities is connected to the number of reviews on a phone model. Apple has the most popular models (iPhone 6 and 7), next are LG G7, Samsung Galaxy S8 and Google Pixel 2 XL.
The Votes for "BUY" in the order:
Apple phones have the larger range across all the 5 rating group in regards to raw count, Samsung phones have a larger number of 1 ratings. But in the percentage boxplot, LG phones have the highest percentage ofd 5 ratings in its own group, and Motorola phones have the highest percentage of 1-rating.
# combine all three dataframes into a new 'df', keeps only the interested columns from new_product dataframe,
# remove models with review less than 10
product=new_product[['brand','skuId','product_name','design','display','feature','storage', 'batterylife']].drop_duplicates()
price_df = avg_price.reset_index().drop(columns=['brand'])
df = review_df.merge(product, how='left', on='skuId',suffixes=('_x', ''))
df = df.merge(price_df, how='left', on='product_name').groupby(
'skuId').filter(lambda x: x.shape[0]>10)
# convert 'recommending' column to 1 for yes, 0 for blank
df['recommending']=df.recommending.apply(lambda x: 1 if x=='True' else 0)
import numpy as np
# some models of Apple phones has very larger number of reviews compared to other models, need to view them in log scale
# count how many reviews per model, convert the count to log, plot log count in boxplot by group of brand
popular = df.groupby('brand')['product_name'].value_counts().apply(lambda x: np.log(x))
popular_df = pd.DataFrame(popular)
popular_df.index = popular_df.index.droplevel(1)
popular_df = popular_df.reset_index()
sns.boxplot(y='product_name', x='brand',data=popular_df,palette="colorblind")
plt.ylabel("Log of Reviews")
plt.title("Popularity (Log of Reviews) vs Brand")
# bar chart for Review numbers against phone model in review log scale
popular_df = pd.DataFrame(popular)
popular_df.columns=['log_review']
popular_df = popular_df.reset_index()
plt.figure(figsize=(10,8))
sns.barplot(data=popular_df,y='product_name', x='log_review',hue='brand', dodge=False)
plt.ylabel('phone model')
plt.title('Log of Review Numbers vs Phone Models')
# ratings
def filter_brand(x):
keep_list= ['brand','design', 'display', 'feature', 'storage', 'batterylife']
return (x['brand'] in ['Apple','Google','LG','Motorola', 'Samsung'])
# calculate 5, 4, 3, 2, 1 rating count, subset dataset on models in keep_list
# convert count into log
rating_df = pd.DataFrame(df.groupby(['brand','product_name'])['rating'].value_counts())
rating_df.columns=['count']
rating_df['count'] = rating_df['count'].apply(lambda x: np.log(x))
rating_df.index=rating_df.index.droplevel(level=1)
rating_df = rating_df.reset_index()
rating_df = rating_df.loc[rating_df.apply(filter_brand, axis=1)]
fig, axs = plt.subplots(1, 5, figsize=(5*5,5), subplot_kw={'aspect':'equal'})
# boxplot chart on brand group
sns.set(font_scale=1.2)
g = sns.catplot(kind='box',y='count', x='rating', col='brand', data=rating_df,legend=True, height=4, aspect=1)
g.set_titles("{col_name}" + " Rating Count in Log Scale")
g.set_axis_labels("Rating", "Count in Log")
# prepare the same data (without log conversion) to pie chart
# sorting and setting the index on rating column
df1=df[['brand','product_name','rating']]
df1 = df1.loc[df1.apply(filter_brand, axis=1)].groupby(['brand','rating']).count()
df1.columns=['count']
df1= df1.reset_index()
df1 = df1.sort_values(by=['rating','count'], ascending=[False,False]).set_index('rating')
N=rating_df['rating'].drop_duplicates().shape[0]
for ax,(groupname,subdf) in zip(axs,df1.groupby('brand')):
ax.pie(subdf['count'], labels=subdf.index, autopct='%.1f%%')
ax.set_title(groupname + " Rating (%)")
apple_df=df.loc[df['brand']=='Apple'][['product_name','rating']].groupby('product_name')['rating'].value_counts()
apple_df=pd.DataFrame(apple_df)
apple_df.columns=['count']
apple_df=apple_df.reset_index()
apple_df= apple_df.sort_values(by=['rating','count'], ascending=[False,False]).set_index('rating')
print(avg_price.loc[avg_price['brand']=='Apple'][['product_name','price']].set_index('product_name'))
sns.set(font_scale=1.1)
fig, axs = plt.subplots(3, 4, figsize=(4*4,4*3), subplot_kw={'aspect':'equal'})
idx=0
for (groupname,subdf) in apple_df.groupby('product_name'):
i,j = divmod(idx,4)
axs[i,j].pie(subdf['count'], labels=subdf.index, autopct='%.1f%%')
axs[i,j].set_title(groupname + " Rating (%)")
idx += 1
axs[2,2].set_visible(False)
axs[2,3].set_visible(False)
samsung_df=df.loc[df['brand']=='Samsung'][['product_name','rating']].groupby('product_name')['rating'].value_counts()
samsung_df=pd.DataFrame(samsung_df)
samsung_df.columns=['count']
samsung_df=samsung_df.reset_index()
samsung_df=samsung_df.sort_values(by=['rating','count'], ascending=[False,False]).set_index('rating')
print(avg_price.loc[avg_price['brand']=='Samsung'][['product_name','price']].set_index("product_name"))
fig, axs = plt.subplots(3, 3, figsize=(3*4,3*4), subplot_kw={'aspect':'equal'})
idx = 0
for (groupname,subdf) in samsung_df.groupby('product_name'):
i,j = divmod(idx, 3)
axs[i,j].pie(subdf['count'], labels=subdf.index, autopct='%.1f%%')
axs[i,j].set_title(groupname + " Rating (%)")
idx += 1
lg_df=df.loc[df['brand']=='LG'][['product_name','rating']].groupby('product_name')['rating'].value_counts()
lg_df=pd.DataFrame(lg_df)
lg_df.columns=['count']
lg_df=lg_df.reset_index()
lg_df=lg_df.sort_values(by=['rating','count'], ascending=[False,False]).set_index('rating')
print(avg_price.loc[avg_price['brand']=='LG'][['product_name','price']][:3].set_index('product_name'))
fig, axs = plt.subplots(1, 3, figsize=(4*3,4), subplot_kw={'aspect':'equal'})
for ax, (groupname,subdf) in zip(axs, lg_df.groupby('product_name')):
ax.pie(subdf['count'], labels=subdf.index, autopct='%.1f%%')
ax.set_title(groupname + " Rating (%)")
idx += 1
# subset helpfulness newer than year 2017, make strip chart on the top 10 models
helpfulness_gt2017 = df.loc[df['date_published']>'2017'][['product_name', 'helpful', 'unhelpful']]
models=helpfulness_gt2017.groupby('product_name').agg({'helpful':'sum',\
'unhelpful':'sum'}).sort_values(by=['helpful',\
'unhelpful'], ascending=False)[:10].index
helpfulness_gt2017=helpfulness_gt2017.loc[helpfulness_gt2017.apply(lambda x: x['product_name'] in models, axis=1)]
# melt helpful and unhelpful columns into single column named vote
helpfulness_gt2017 = helpfulness_gt2017.melt(id_vars=['product_name'], var_name='vote', \
value_name='vote number').sort_values(by=['product_name'])
# remove outlier
helpfulness_gt2017=helpfulness_gt2017.loc[helpfulness_gt2017['vote number']<1000]
# strip plot
sns.catplot(kind='strip', x='vote number', hue='vote', y='product_name',dodge=True, \
jitter=True,data=helpfulness_gt2017, height=6, aspect=1.2, palette="colorblind")
plt.title("Reviews were Voted Helpful and Unhelpful Since 2017")
plt.ylabel("Phone Model")
# scatter plot of helpfulness on 3 phone models 'Samsung Galaxy S10+', 'Apple iPhone 8', 'LG K20 V'
models = ['Samsung Galaxy S10+', 'Apple iPhone 8', 'LG K20 V']
helpfulness = df.loc[df['product_name'].apply(lambda x: x in models)][['product_name', 'helpful', 'unhelpful']]
sns.scatterplot(x='helpful', y='unhelpful', hue='product_name', data=helpfulness, s = 70)
plt.title("Helpful vs Unhelpful")
gp = df.groupby(['brand','product_name'])
#pct_recommend = gp['recommending'].apply(lambda x: sum(x ==1)/len(x)*100)
pct_recommend = pd.DataFrame(gp['recommending'].sum())
pct_recommend.columns = ['recommending']
pct_recommend = pct_recommend.loc[pct_recommend['recommending']>0].apply(lambda x: np.log(x), axis=1).reset_index()
pct_recommend = pct_recommend.sort_values(by=['brand','recommending'], ascending=[True, False])
#print(pct_recommend)
plt.figure(figsize=(6,8))
sns.barplot(y='product_name',x='recommending', hue='brand',data=pct_recommend, palette="colorblind", dodge=False)
plt.title('Buy Recommendation vs Models')
plt.ylabel("Phone Models")
plt.xlabel("'Buy' Recommendation Count in Log Scale")
plt.legend(bbox_to_anchor=(1.02,1.0), loc='upper left')
plot_list=['brand','product_name','design', 'display', 'feature', 'storage', 'batterylife']
sub_df = df.loc[df.apply(filter_brand, axis=1)][plot_list]
sub_df[plot_list[2:]] = sub_df[plot_list[2:]].apply(pd.to_numeric)
sub_df = sub_df.drop_duplicates()
sub_df = sub_df.melt(id_vars=['brand','product_name'],var_name='sub_group', value_name='rating')
#print(sub_df)
fig, axs = plt.subplots(nrows=5, ncols=1, figsize=(6,5*6), subplot_kw={'aspect':'equal'})
idx=0
for (groupname,subdf) in sub_df.groupby('brand'):
sns.barplot(y='product_name',x='rating',hue='sub_group', data=subdf, dodge=True, ax=axs[idx])
#ax.set_title(groupname)
axs[idx].legend(bbox_to_anchor=(1.01,1), loc=2, borderaxespad=0.)
idx +=1
#plt.title("Ratings on Design, Display, Feature, Storage and BatteryLife on Phone Brand")
df1 = df[['brand','product_name','price']].drop_duplicates()
df1 = df1.loc[df1.apply(filter_brand, axis=1)]
new_df = df.groupby(['brand','product_name'])['recommending'].apply(lambda x: np.mean(x)*100)
new_df = new_df.reset_index().merge(df1, how='right', on='product_name', suffixes=('', '_y')).drop(columns=['brand_y', 'product_name'])
sns.lineplot(x='price', y='recommending', hue='brand', data=new_df, palette="colorblind")
plt.title('Buy Recommendation vs Price')
plt.ylabel('Buy Recommendation')
#g.fig.set_size_inches(14,6)
import nltk
import re
df1=df.copy()
df1.columns
df1['text'] = df1['title'] + ' ' + df1['text']
pd.set_option('max_colwidth',200)
df1['text']=df1['text'].str.lower().apply(lambda x: re.sub('\s+', ' ', x))
#df1['text'].sample(10)
#remove punctuation
df1['text']=df1['text'].apply(lambda x: re.sub('[^\w\s]','',x))
from nltk.corpus import stopwords
stop = stopwords.words('english')
#stop.extend(['iphone', '6s','plus'])
#stop.extend(['im', 'ive', 'iphone', 'x','xs', 'max', 'phone', 'due','still','phones'])
#print(stop)
df1['text']=df1['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
putting review content of iPhone 7 and Sumsung Galaxy S8 into WordCloud, output shows what words used frequently in reveiws.
from wordcloud import WordCloud
wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
# generate word cloud
mask = df1['product_name']=='Apple iPhone 7'
iphone_7 = df1[mask]
wc.generate(' '.join(iphone_7['text']))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
#wc = WordCloud(background_color="white", max_words=20000, width=800, height=400)# generate word cloud
mask = df1['product_name']=='Samsung Galaxy S8'
samsung_S8 = df1[mask]
wc.generate(' '.join(samsung_S8['text']))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()